{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Task \n",
    "\n",
    "This notebook prepares SHARE data. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "metadata": {},
   "outputs": [],
   "source": [
    "share_w1 = '../data_sources/share/stata_sharew1_rel2-5-0__all_capi_modules/'\n",
    "share_w2 = '../data_sources/share/stata_sharew2_rel2-5-0__all_capi_modules/'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wave 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demographics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn = pd.read_stata(share_w1+'sharew1_rel2-5-0_dn.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn['rage_w1'] = 2004 - dn['dn003_']\n",
    "dn.loc[dn['rage_w1']>120,'rage_w1'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 307,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn['hhidpn'] = dn['hhid1'] + dn['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 308,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn = dn.loc[:,['hhidpn','mergeid','country','rage_w1']]\n",
    "dn.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 309,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "26.0     2\n",
       "28.0     1\n",
       "29.0     1\n",
       "30.0     1\n",
       "31.0     6\n",
       "        ..\n",
       "100.0    5\n",
       "101.0    1\n",
       "102.0    2\n",
       "103.0    1\n",
       "104.0    1\n",
       "Name: rage_w1, Length: 78, dtype: int64"
      ]
     },
     "execution_count": 309,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dn.rage_w1.value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 310,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mergeid</th>\n",
       "      <th>rage_w1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1893</td>\n",
       "      <td>1893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>3008</td>\n",
       "      <td>3007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>3053</td>\n",
       "      <td>3053</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>2979</td>\n",
       "      <td>2972</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>2396</td>\n",
       "      <td>2396</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2559</td>\n",
       "      <td>2559</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>3193</td>\n",
       "      <td>3193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1707</td>\n",
       "      <td>1707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>2898</td>\n",
       "      <td>2898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>1004</td>\n",
       "      <td>1003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>3827</td>\n",
       "      <td>3827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>2598</td>\n",
       "      <td>2595</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         mergeid  rage_w1\n",
       "country                  \n",
       "11          1893     1893\n",
       "12          3008     3007\n",
       "13          3053     3053\n",
       "14          2979     2972\n",
       "15          2396     2396\n",
       "16          2559     2559\n",
       "17          3193     3193\n",
       "18          1707     1707\n",
       "19          2898     2898\n",
       "20          1004     1003\n",
       "23          3827     3827\n",
       "25          2598     2595"
      ]
     },
     "execution_count": 310,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dn.groupby('country').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31115"
      ]
     },
     "execution_count": 311,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BMI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "metadata": {},
   "outputs": [],
   "source": [
    "gv = pd.read_stata(share_w1+'sharew1_rel2-5-0_gv_health.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 313,
   "metadata": {},
   "outputs": [],
   "source": [
    "gv['hhidpn'] = gv['hhid1'] + gv['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 314,
   "metadata": {},
   "outputs": [],
   "source": [
    "gv = gv.rename({'bmi':'rbmi_w1'},axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "metadata": {},
   "outputs": [],
   "source": [
    "gv = gv.loc[:,['hhidpn','rbmi_w1']]\n",
    "gv.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 316,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rbmi_w1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>32.787994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>31.161373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>23.722811</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A2</th>\n",
       "      <td>27.440599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>34.049030</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                rbmi_w1\n",
       "hhidpn                 \n",
       "AT-000327-A1  32.787994\n",
       "AT-000327-A2  31.161373\n",
       "AT-001816-A1  23.722811\n",
       "AT-001816-A2  27.440599\n",
       "AT-002132-A1  34.049030"
      ]
     },
     "execution_count": 316,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31115"
      ]
     },
     "execution_count": 317,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(gv)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Health Conditions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "metadata": {},
   "outputs": [],
   "source": [
    "ph = pd.read_stata(share_w1+'sharew1_rel2-5-0_ph.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "metadata": {},
   "outputs": [],
   "source": [
    "ph['hhidpn'] = ph['hhid1'] + ph['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 320,
   "metadata": {},
   "outputs": [],
   "source": [
    "ph = ph.rename({'ph006d1':'rhearte_w1','ph006d2':'rhibpe_w1','ph006d5':'rdiabe_w1','ph006d10':'rcancre_w1','ph006d6':'rlunge_w1','ph006d4':'rstroke_w1'},axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 321,
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in ['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']:\n",
    "\tph[c] = np.where(ph[c]<0,np.nan,ph[c])\n",
    "ph['radla_w1'] = ph[['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']].sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "metadata": {},
   "outputs": [],
   "source": [
    "ph = ph.loc[:,['hhidpn','rhearte_w1','rhibpe_w1','rdiabe_w1','rcancre_w1','rlunge_w1','rstroke_w1','radla_w1']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 323,
   "metadata": {},
   "outputs": [],
   "source": [
    "ph.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 324,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rhearte_w1</th>\n",
       "      <th>rhibpe_w1</th>\n",
       "      <th>rdiabe_w1</th>\n",
       "      <th>rcancre_w1</th>\n",
       "      <th>rlunge_w1</th>\n",
       "      <th>rstroke_w1</th>\n",
       "      <th>radla_w1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              rhearte_w1  rhibpe_w1  rdiabe_w1  rcancre_w1  rlunge_w1  \\\n",
       "hhidpn                                                                  \n",
       "AT-000327-A1         0.0        0.0        0.0         0.0        0.0   \n",
       "AT-000327-A2         0.0        0.0        0.0         0.0        0.0   \n",
       "AT-001816-A1         0.0        0.0        0.0         0.0        0.0   \n",
       "AT-001816-A2         0.0        0.0        0.0         0.0        0.0   \n",
       "AT-002132-A1         0.0        0.0        0.0         0.0        1.0   \n",
       "\n",
       "              rstroke_w1  radla_w1  \n",
       "hhidpn                              \n",
       "AT-000327-A1         0.0       0.0  \n",
       "AT-000327-A2         0.0       0.0  \n",
       "AT-001816-A1         0.0       0.0  \n",
       "AT-001816-A2         0.0       0.0  \n",
       "AT-002132-A1         0.0       2.0  "
      ]
     },
     "execution_count": 324,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ph.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31115"
      ]
     },
     "execution_count": 325,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ph)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Income and Self-reported Health"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 326,
   "metadata": {},
   "outputs": [],
   "source": [
    "im = pd.read_stata(share_w1+'sharew1_rel2-5-0_imputations.dta',convert_categoricals=False)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 327,
   "metadata": {},
   "outputs": [],
   "source": [
    "im = im.rename({'pppx2004':'pppx_w1'},axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 328,
   "metadata": {},
   "outputs": [],
   "source": [
    "im['hhidpn'] = im['hhid1'] + im['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 329,
   "metadata": {},
   "outputs": [],
   "source": [
    "im = im.rename({'hgtincv':'hitot_w1'},axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "metadata": {},
   "outputs": [],
   "source": [
    "im = im.rename({'srhealtha':'rshlt_w1'},axis=1)\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 331,
   "metadata": {},
   "outputs": [],
   "source": [
    "im = im.loc[:,['hhidpn','hitot_w1','rshlt_w1','pppx_w1']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [],
   "source": [
    "im.drop_duplicates(subset=['hhidpn'], keep='first',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "im.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 334,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32405"
      ]
     },
     "execution_count": 334,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(im)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [],
   "source": [
    "wt = pd.read_stata(share_w1+'sharew1_rel2-5-0_gv_weights.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 336,
   "metadata": {},
   "outputs": [],
   "source": [
    "wt['hhidpn'] = wt['hhid1'] + wt['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 337,
   "metadata": {},
   "outputs": [],
   "source": [
    "wt = wt.rename({'w1aci':'wgid_w1'},axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {},
   "outputs": [],
   "source": [
    "wt = wt.loc[:,['hhidpn','wgid_w1']]\n",
    "wt.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Behavioral Risk Factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {},
   "outputs": [],
   "source": [
    "br = pd.read_stata(share_w1+'sharew1_rel2-5-0_br.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 340,
   "metadata": {},
   "outputs": [],
   "source": [
    "br['hhidpn'] = br['hhid1'] + br['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 341,
   "metadata": {},
   "outputs": [],
   "source": [
    "br = br.rename({'br001_':'rsmokev_w1'},axis=1)\n",
    "br['rsmokev_w1'] = br['rsmokev_w1'].replace({5:0})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 342,
   "metadata": {},
   "outputs": [],
   "source": [
    "br = br.rename({'br015_':'rvgactx_w1'},axis=1)\n",
    "br.loc[br['rvgactx_w1']>4,'rvgactx_w1'] = np.nan\n",
    "br = br.rename({'br016_':'rmdactx_w1'},axis=1)\n",
    "br.loc[br['rmdactx_w1']>4,'rmdactx_w1'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 343,
   "metadata": {},
   "outputs": [],
   "source": [
    "br = br.rename({'br010_':'rdrinkv_w1'},axis=1)\n",
    "br.loc[br['rdrinkv_w1']>7,'rdrinkv_w1'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 344,
   "metadata": {},
   "outputs": [],
   "source": [
    "br = br.loc[:,['hhidpn','rsmokev_w1','rvgactx_w1','rmdactx_w1','rdrinkv_w1']]\n",
    "br.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 345,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rsmokev_w1</th>\n",
       "      <th>rvgactx_w1</th>\n",
       "      <th>rmdactx_w1</th>\n",
       "      <th>rdrinkv_w1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              rsmokev_w1  rvgactx_w1  rmdactx_w1  rdrinkv_w1\n",
       "hhidpn                                                      \n",
       "AT-000327-A1         0.0         2.0         1.0         4.0\n",
       "AT-000327-A2         0.0         3.0         1.0         6.0\n",
       "AT-001816-A1         1.0         3.0         1.0         4.0\n",
       "AT-001816-A2         0.0         2.0         1.0         4.0\n",
       "AT-002132-A1         0.0         4.0         4.0        -1.0"
      ]
     },
     "execution_count": 345,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "br.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 346,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh_w1 = dn.merge(gv,left_index=True,right_index=True,how='left').merge(ph,left_index=True,right_index=True,how='left').merge(im,left_index=True,right_index=True,how='left').merge(wt,left_index=True,right_index=True,how='left').merge(br,left_index=True,right_index=True,how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 347,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mergeid</th>\n",
       "      <th>country</th>\n",
       "      <th>rage_w1</th>\n",
       "      <th>rbmi_w1</th>\n",
       "      <th>rhearte_w1</th>\n",
       "      <th>rhibpe_w1</th>\n",
       "      <th>rdiabe_w1</th>\n",
       "      <th>rcancre_w1</th>\n",
       "      <th>rlunge_w1</th>\n",
       "      <th>rstroke_w1</th>\n",
       "      <th>radla_w1</th>\n",
       "      <th>hitot_w1</th>\n",
       "      <th>rshlt_w1</th>\n",
       "      <th>pppx_w1</th>\n",
       "      <th>wgid_w1</th>\n",
       "      <th>rsmokev_w1</th>\n",
       "      <th>rvgactx_w1</th>\n",
       "      <th>rmdactx_w1</th>\n",
       "      <th>rdrinkv_w1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>AT-000327-01</td>\n",
       "      <td>11</td>\n",
       "      <td>52.0</td>\n",
       "      <td>32.787994</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27189.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.95925</td>\n",
       "      <td>1726.401367</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>AT-000327-02</td>\n",
       "      <td>11</td>\n",
       "      <td>49.0</td>\n",
       "      <td>31.161373</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27189.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.95925</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>AT-001816-01</td>\n",
       "      <td>11</td>\n",
       "      <td>61.0</td>\n",
       "      <td>23.722811</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>275700.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.95925</td>\n",
       "      <td>999.335510</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A2</th>\n",
       "      <td>AT-001816-02</td>\n",
       "      <td>11</td>\n",
       "      <td>56.0</td>\n",
       "      <td>27.440599</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>275700.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.95925</td>\n",
       "      <td>1726.401367</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>AT-002132-01</td>\n",
       "      <td>11</td>\n",
       "      <td>71.0</td>\n",
       "      <td>34.049030</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>13583.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.95925</td>\n",
       "      <td>1292.320679</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   mergeid  country  rage_w1    rbmi_w1  rhearte_w1  \\\n",
       "hhidpn                                                                \n",
       "AT-000327-A1  AT-000327-01       11     52.0  32.787994         0.0   \n",
       "AT-000327-A2  AT-000327-02       11     49.0  31.161373         0.0   \n",
       "AT-001816-A1  AT-001816-01       11     61.0  23.722811         0.0   \n",
       "AT-001816-A2  AT-001816-02       11     56.0  27.440599         0.0   \n",
       "AT-002132-A1  AT-002132-01       11     71.0  34.049030         0.0   \n",
       "\n",
       "              rhibpe_w1  rdiabe_w1  rcancre_w1  rlunge_w1  rstroke_w1  \\\n",
       "hhidpn                                                                  \n",
       "AT-000327-A1        0.0        0.0         0.0        0.0         0.0   \n",
       "AT-000327-A2        0.0        0.0         0.0        0.0         0.0   \n",
       "AT-001816-A1        0.0        0.0         0.0        0.0         0.0   \n",
       "AT-001816-A2        0.0        0.0         0.0        0.0         0.0   \n",
       "AT-002132-A1        0.0        0.0         0.0        1.0         0.0   \n",
       "\n",
       "              radla_w1  hitot_w1  rshlt_w1  pppx_w1      wgid_w1  rsmokev_w1  \\\n",
       "hhidpn                                                                         \n",
       "AT-000327-A1       0.0   27189.0       1.0  0.95925  1726.401367         0.0   \n",
       "AT-000327-A2       0.0   27189.0       1.0  0.95925          NaN         0.0   \n",
       "AT-001816-A1       0.0  275700.0       2.0  0.95925   999.335510         1.0   \n",
       "AT-001816-A2       0.0  275700.0       2.0  0.95925  1726.401367         0.0   \n",
       "AT-002132-A1       2.0   13583.0       5.0  0.95925  1292.320679         0.0   \n",
       "\n",
       "              rvgactx_w1  rmdactx_w1  rdrinkv_w1  \n",
       "hhidpn                                            \n",
       "AT-000327-A1         2.0         1.0         4.0  \n",
       "AT-000327-A2         3.0         1.0         6.0  \n",
       "AT-001816-A1         3.0         1.0         4.0  \n",
       "AT-001816-A2         2.0         1.0         4.0  \n",
       "AT-002132-A1         4.0         4.0        -1.0  "
      ]
     },
     "execution_count": 347,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sh_w1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 348,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['mergeid',\n",
       " 'country',\n",
       " 'rage_w1',\n",
       " 'rbmi_w1',\n",
       " 'rhearte_w1',\n",
       " 'rhibpe_w1',\n",
       " 'rdiabe_w1',\n",
       " 'rcancre_w1',\n",
       " 'rlunge_w1',\n",
       " 'rstroke_w1',\n",
       " 'radla_w1',\n",
       " 'hitot_w1',\n",
       " 'rshlt_w1',\n",
       " 'pppx_w1',\n",
       " 'wgid_w1',\n",
       " 'rsmokev_w1',\n",
       " 'rvgactx_w1',\n",
       " 'rmdactx_w1',\n",
       " 'rdrinkv_w1']"
      ]
     },
     "execution_count": 348,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sh_w1.columns.to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh_w1.to_pickle('../data_sources/share/share_w1.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 350,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31115"
      ]
     },
     "execution_count": 350,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sh_w1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wave 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demographics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 351,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn = pd.read_stata(share_w2+'sharew2_rel2-5-0_dn.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 352,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn['rage_w2'] = 2006 - dn['dn003_']\n",
    "dn.loc[dn['rage_w2']>120,'rage_w2'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 353,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn['hhidpn'] = dn['hhid2'] + dn['cvid'].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "dn = dn.loc[:,['hhidpn','mergeid','country','rage_w2']]\n",
    "dn.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BMI\n",
    "\n",
    "We will keep BMI in wave 2 but note missing for most because height was not asked again. In paper, only use wave 1 to define obesity. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [],
   "source": [
    "gv = pd.read_stata(share_w2+'sharew2_rel2-5-0_gv_health.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 356,
   "metadata": {},
   "outputs": [],
   "source": [
    "gv['hhidpn'] = gv['hhid2'] + gv['cvid'].astype('str')\n",
    "gv = gv.rename({'bmi':'rbmi_w2'},axis=1)\n",
    "gv = gv.loc[:,['hhidpn','rbmi_w2']]\n",
    "gv.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 357,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rbmi_w2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-004234-A2</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              rbmi_w2\n",
       "hhidpn               \n",
       "AT-000327-A1      NaN\n",
       "AT-000327-A2      NaN\n",
       "AT-001816-A1      NaN\n",
       "AT-002132-A1      NaN\n",
       "AT-004234-A2      NaN"
      ]
     },
     "execution_count": 357,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gv.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Health Conditions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 358,
   "metadata": {},
   "outputs": [],
   "source": [
    "ph = pd.read_stata(share_w2+'sharew2_rel2-5-0_ph.dta',convert_categoricals=False)\n",
    "ph['hhidpn'] = ph['hhid2'] + ph['cvid'].astype('str')\n",
    "ph = ph.rename({'ph006d1':'rhearte_w2','ph006d2':'rhibpe_w2','ph006d5':'rdiabe_w2','ph006d10':'rcancre_w2','ph006d6':'rlunge_w2','ph006d4':'rstroke_w2'},axis=1)\n",
    "for c in ['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']:\n",
    "\tph[c] = np.where(ph[c]<0,np.nan,ph[c])\n",
    "ph['radla_w2'] = ph[['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']].sum(axis=1)\n",
    "ph = ph.loc[:,['hhidpn','rhearte_w2','rhibpe_w2','rdiabe_w2','rcancre_w2','rlunge_w2','rstroke_w2','radla_w2']]\n",
    "ph.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 359,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rhearte_w2</th>\n",
       "      <th>rhibpe_w2</th>\n",
       "      <th>rdiabe_w2</th>\n",
       "      <th>rcancre_w2</th>\n",
       "      <th>rlunge_w2</th>\n",
       "      <th>rstroke_w2</th>\n",
       "      <th>radla_w2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-004234-A2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              rhearte_w2  rhibpe_w2  rdiabe_w2  rcancre_w2  rlunge_w2  \\\n",
       "hhidpn                                                                  \n",
       "AT-000327-A1         0.0        0.0        0.0         1.0        0.0   \n",
       "AT-000327-A2         0.0        1.0        0.0         0.0        0.0   \n",
       "AT-001816-A1         0.0        1.0        0.0         0.0        0.0   \n",
       "AT-002132-A1         0.0        0.0        0.0         0.0        1.0   \n",
       "AT-004234-A2         0.0        0.0        0.0         0.0        0.0   \n",
       "\n",
       "              rstroke_w2  radla_w2  \n",
       "hhidpn                              \n",
       "AT-000327-A1         0.0       0.0  \n",
       "AT-000327-A2         0.0       0.0  \n",
       "AT-001816-A1         0.0       0.0  \n",
       "AT-002132-A1         0.0       5.0  \n",
       "AT-004234-A2         0.0       0.0  "
      ]
     },
     "execution_count": 359,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ph.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Income and Self-reported Health"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "metadata": {},
   "outputs": [],
   "source": [
    "im = pd.read_stata(share_w2+'sharew2_rel2-5-0_imputations.dta',convert_categoricals=False)\n",
    "im['hhidpn'] = im['hhid2'] + im['cvid'].astype('str')\n",
    "im = im.rename({'hgtincv':'hitot_w2'},axis=1)\n",
    "im = im.rename({'pppx2006':'pppx_w2'},axis=1)\n",
    "im = im.rename({'srhealtha':'rshlt_w2'},axis=1)\n",
    "im = im.loc[:,['hhidpn','hitot_w2','rshlt_w2','pppx_w2']]\n",
    "im.drop_duplicates(subset=['hhidpn'], keep='first',inplace=True)\n",
    "im.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hitot_w2</th>\n",
       "      <th>rshlt_w2</th>\n",
       "      <th>pppx_w2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>25300</td>\n",
       "      <td>5</td>\n",
       "      <td>0.99545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>25300</td>\n",
       "      <td>2</td>\n",
       "      <td>0.99545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>57008</td>\n",
       "      <td>3</td>\n",
       "      <td>0.99545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>14300</td>\n",
       "      <td>5</td>\n",
       "      <td>0.99545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-004234-A2</th>\n",
       "      <td>23620</td>\n",
       "      <td>4</td>\n",
       "      <td>0.99545</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              hitot_w2  rshlt_w2  pppx_w2\n",
       "hhidpn                                   \n",
       "AT-000327-A1     25300         5  0.99545\n",
       "AT-000327-A2     25300         2  0.99545\n",
       "AT-001816-A1     57008         3  0.99545\n",
       "AT-002132-A1     14300         5  0.99545\n",
       "AT-004234-A2     23620         4  0.99545"
      ]
     },
     "execution_count": 361,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "im.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 362,
   "metadata": {},
   "outputs": [],
   "source": [
    "wt = pd.read_stata(share_w2+'sharew2_rel2-5-0_gv_weights.dta',convert_categoricals=False)\n",
    "wt['hhidpn'] = wt['hhid2'] + wt['cvid'].astype('str')\n",
    "wt = wt.rename({'w2aci':'wgid_w2'},axis=1)\n",
    "wt = wt.loc[:,['hhidpn','wgid_w2']]\n",
    "wt.set_index('hhidpn',inplace=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Behavioral risk factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [],
   "source": [
    "br = pd.read_stata(share_w2+'sharew2_rel2-5-0_br.dta',convert_categoricals=False)\n",
    "br['hhidpn'] = br['hhid2'] + br['cvid'].astype('str')\n",
    "br = br.rename({'br001_':'rsmokev_w2'},axis=1)\n",
    "br['rsmokev_w2'] = br['rsmokev_w2'].replace({5:0})\n",
    "br = br.rename({'br015_':'rvgactx_w2'},axis=1)\n",
    "br.loc[br['rvgactx_w2']>4,'rvgactx_w2'] = np.nan\n",
    "br = br.rename({'br016_':'rmdactx_w2'},axis=1)\n",
    "br.loc[br['rmdactx_w2']>4,'rmdactx_w2'] = np.nan\n",
    "br = br.rename({'br010_':'rdrinkv_w2'},axis=1)\n",
    "br.loc[br['rdrinkv_w2']>7,'rdrinkv_w2'] = np.nan\n",
    "br = br.rename({'br019_':'rdrinkn_w2'},axis=1)\n",
    "br = br.loc[:,['hhidpn','rsmokev_w2','rvgactx_w2','rmdactx_w2','rdrinkv_w2','rdrinkn_w2']]\n",
    "br.set_index('hhidpn',inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mergeid</th>\n",
       "      <th>rage_w2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1341</td>\n",
       "      <td>1341</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2568</td>\n",
       "      <td>2568</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2745</td>\n",
       "      <td>2745</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>2661</td>\n",
       "      <td>2661</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>2228</td>\n",
       "      <td>2228</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2983</td>\n",
       "      <td>2983</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2968</td>\n",
       "      <td>2968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>2616</td>\n",
       "      <td>2616</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>3243</td>\n",
       "      <td>3242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>1462</td>\n",
       "      <td>1462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>3169</td>\n",
       "      <td>3169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>2830</td>\n",
       "      <td>2830</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>2467</td>\n",
       "      <td>2467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>1134</td>\n",
       "      <td>1134</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         mergeid  rage_w2\n",
       "country                  \n",
       "11          1341     1341\n",
       "12          2568     2568\n",
       "13          2745     2745\n",
       "14          2661     2661\n",
       "15          2228     2228\n",
       "16          2983     2983\n",
       "17          2968     2968\n",
       "18          2616     2616\n",
       "19          3243     3242\n",
       "20          1462     1462\n",
       "23          3169     3169\n",
       "28          2830     2830\n",
       "29          2467     2467\n",
       "30          1134     1134"
      ]
     },
     "execution_count": 364,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dn.groupby('country').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh_w2 = dn.merge(gv,left_index=True,right_index=True,how='left').merge(ph,left_index=True,right_index=True,how='left').merge(im,left_index=True,right_index=True,how='left').merge(wt,left_index=True,right_index=True,how='left').merge(br,left_index=True,right_index=True,how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 366,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mergeid</th>\n",
       "      <th>country</th>\n",
       "      <th>rage_w2</th>\n",
       "      <th>rbmi_w2</th>\n",
       "      <th>rhearte_w2</th>\n",
       "      <th>rhibpe_w2</th>\n",
       "      <th>rdiabe_w2</th>\n",
       "      <th>rcancre_w2</th>\n",
       "      <th>rlunge_w2</th>\n",
       "      <th>rstroke_w2</th>\n",
       "      <th>radla_w2</th>\n",
       "      <th>hitot_w2</th>\n",
       "      <th>rshlt_w2</th>\n",
       "      <th>pppx_w2</th>\n",
       "      <th>wgid_w2</th>\n",
       "      <th>rsmokev_w2</th>\n",
       "      <th>rvgactx_w2</th>\n",
       "      <th>rmdactx_w2</th>\n",
       "      <th>rdrinkv_w2</th>\n",
       "      <th>rdrinkn_w2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hhidpn</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AT-000327-A1</th>\n",
       "      <td>AT-000327-01</td>\n",
       "      <td>11</td>\n",
       "      <td>54.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25300.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.99545</td>\n",
       "      <td>3402.772949</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-000327-A2</th>\n",
       "      <td>AT-000327-02</td>\n",
       "      <td>11</td>\n",
       "      <td>51.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25300.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.99545</td>\n",
       "      <td>2391.937988</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-001816-A1</th>\n",
       "      <td>AT-001816-02</td>\n",
       "      <td>11</td>\n",
       "      <td>58.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>57008.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.99545</td>\n",
       "      <td>3402.772949</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-002132-A1</th>\n",
       "      <td>AT-002132-01</td>\n",
       "      <td>11</td>\n",
       "      <td>73.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>14300.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.99545</td>\n",
       "      <td>1778.268433</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AT-004234-A2</th>\n",
       "      <td>AT-004234-01</td>\n",
       "      <td>11</td>\n",
       "      <td>56.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>23620.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.99545</td>\n",
       "      <td>3548.626221</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   mergeid  country  rage_w2  rbmi_w2  rhearte_w2  rhibpe_w2  \\\n",
       "hhidpn                                                                         \n",
       "AT-000327-A1  AT-000327-01       11     54.0      NaN         0.0        0.0   \n",
       "AT-000327-A2  AT-000327-02       11     51.0      NaN         0.0        1.0   \n",
       "AT-001816-A1  AT-001816-02       11     58.0      NaN         0.0        1.0   \n",
       "AT-002132-A1  AT-002132-01       11     73.0      NaN         0.0        0.0   \n",
       "AT-004234-A2  AT-004234-01       11     56.0      NaN         0.0        0.0   \n",
       "\n",
       "              rdiabe_w2  rcancre_w2  rlunge_w2  rstroke_w2  radla_w2  \\\n",
       "hhidpn                                                                 \n",
       "AT-000327-A1        0.0         1.0        0.0         0.0       0.0   \n",
       "AT-000327-A2        0.0         0.0        0.0         0.0       0.0   \n",
       "AT-001816-A1        0.0         0.0        0.0         0.0       0.0   \n",
       "AT-002132-A1        0.0         0.0        1.0         0.0       5.0   \n",
       "AT-004234-A2        0.0         0.0        0.0         0.0       0.0   \n",
       "\n",
       "              hitot_w2  rshlt_w2  pppx_w2      wgid_w2  rsmokev_w2  \\\n",
       "hhidpn                                                               \n",
       "AT-000327-A1   25300.0       5.0  0.99545  3402.772949         NaN   \n",
       "AT-000327-A2   25300.0       2.0  0.99545  2391.937988         NaN   \n",
       "AT-001816-A1   57008.0       3.0  0.99545  3402.772949         NaN   \n",
       "AT-002132-A1   14300.0       5.0  0.99545  1778.268433         NaN   \n",
       "AT-004234-A2   23620.0       4.0  0.99545  3548.626221         NaN   \n",
       "\n",
       "              rvgactx_w2  rmdactx_w2  rdrinkv_w2  rdrinkn_w2  \n",
       "hhidpn                                                        \n",
       "AT-000327-A1         4.0         4.0         7.0         NaN  \n",
       "AT-000327-A2         1.0         1.0         7.0         NaN  \n",
       "AT-001816-A1         1.0         2.0         3.0         1.0  \n",
       "AT-002132-A1         4.0         4.0         7.0         NaN  \n",
       "AT-004234-A2         3.0         1.0         4.0         1.0  "
      ]
     },
     "execution_count": 366,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sh_w2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 367,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh_w2.to_pickle('../data_sources/share/share_w2.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating Wide Form SHARE dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 368,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh_w1 = sh_w1.reset_index()\n",
    "sh_w1.set_index('mergeid',inplace=True)\n",
    "sh_w2 = sh_w2.reset_index()\n",
    "sh_w2.set_index('mergeid',inplace=True)\n",
    "sh = sh_w1.merge(sh_w2,left_index=True,right_index=True,how='left')\n",
    "sh = sh.rename({'hhidpn_x':'hhidpn_w1','hhidpn_y':'hhidpn_w2','country_x':'country'},axis=1)\n",
    "sh = sh.drop('country_y',axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Adjust to euros using PPPs in SHARE and then express in US dollars. Exchange rate in 2004 is 1.24 dollar per euro. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 369,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh['hitot_w1'] = sh['hitot_w1']/sh['pppx_w1']*1.24\n",
    "sh['hitot_w2'] = sh['hitot_w2']/sh['pppx_w2']*1.24\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 370,
   "metadata": {},
   "outputs": [],
   "source": [
    "sh.to_pickle('../data_sources/share/share_wide_select.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "cf2a50979671a58939829e6829efb726aa5da11149213b77bd50351f899d04fb"
  },
  "kernelspec": {
   "display_name": "Python 3.8.5 64-bit ('base': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
